# Description ------------------------------------------------------------------

### This script finalizes cleaning of the baseline vendor survey variables
### needed for analysis in the TAD Impact Evaluation Project, making them
### easier to work with in R; merges in treatment status; then saves the
### cleaned version as an .RData file.

# Settings ---------------------------------------------------------------------
# Folders for raw inputs and cleaned outputs. Both keep their trailing slash
# because file names are appended to them with paste0() later in the script.
data_path <- "data/1_raw/"
clean_path <- "data/2_clean/"

# Packages ---------------------------------------------------------------------
#if packages are not installed, they must be installed with:
# install.packages("package_name"); The name of package must be in quotes
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(haven)
library(labelled)
library(hms)

# Convenience Functions --------------------------------------------------------
source("scripts/0_functions/functions_cleaning.R")

# Cleaning ---------------------------------------------------------------------
# Block ids: read the Stata file, then convert every labelled column to a
# factor.
block_ids <- mutate_if(
  read_dta(paste0(data_path, "block_ids.dta")),
  is.labelled,
  to_factor
)

# Treatment assignments (csv stored in the clean data folder).
treatment_groups <- read_csv(
  paste0(clean_path, "treatment_groups.csv")
)

# Baseline vendor survey data (Stata file; no conversion applied at this
# point).
vendor_base <- read_dta(
  paste0(data_path, "marketvendor_BASELINEFINAL_noID_clean_v5.dta")
)

# Fix one outcome variable: derive no_rcpt_when_pay from tc2_16_clean with
# neg_to_na() (presumably recodes negative codes to NA; it is expected to come
# from the sourced cleaning functions -- confirm there), then strip the value
# labels attached to the codes -88 and -99 on the new column.
vendor_base <- vendor_base %>%
  mutate(no_rcpt_when_pay = neg_to_na(tc2_16_clean)) %>%
  remove_value_labels(no_rcpt_when_pay = -88) %>%
  remove_value_labels(no_rcpt_when_pay = -99)

# Convert labelled columns to factors, attach the treatment assignment by
# market and district, then drop the Kasichi observation.
vendor_base <- vendor_base %>%
  mutate_if(is.labelled, to_factor) %>%
  left_join(
    treatment_groups,
    by = c("market" = "Market", "district" = "District")
  ) %>%
  filter(market != "Kasichi")

# Treatment indicators.
# BU_treat / TD_treat: 1 when treatment_status matches "BU" / "TD" or "BOTH".
# BU, TD, Both: mutually exclusive arm dummies built from those two flags.
vendor_base <- vendor_base %>%
  mutate(
    BU_treat = as.numeric(str_detect(treatment_status, "BU|BOTH")),
    TD_treat = as.numeric(str_detect(treatment_status, "TD|BOTH"))
  ) %>%
  mutate(
    BU = ifelse(BU_treat == 1 & TD_treat == 0, 1, 0),
    TD = ifelse(BU_treat == 0 & TD_treat == 1, 1, 0),
    Both = ifelse(BU_treat == 1 & TD_treat == 1, 1, 0)
  )

# Attach the treatment assignment block ids, matching on market and district.
vendor_base <- vendor_base %>%
  left_join(block_ids, by = c("market", "district"))

# Numeric versions of the outcome variables. For factor columns as.numeric()
# yields the underlying level codes.
vendor_base <- vendor_base %>%
  mutate(
    tr1_num = as.numeric(tr1_clean),
    tr2_num = as.numeric(tr2_clean),
    tr9e_num = as.numeric(tr9e_clean),
    ms1_num = as.numeric(ms1_clean),
    ms4_num = as.numeric(ms4_clean),
    ms3_num = as.numeric(ms3_clean),
    ms5_num = as.numeric(ms5_clean),
    ms6_num = as.numeric(ms6_clean),
    satisfaction_dev_num = as.numeric(satisfaction_dev),
    tax_morale_num = as.numeric(tax_morale),
    tc2_4b_num = as.numeric(tc2_4b_clean),
    tc5a_num = as.numeric(tc5a_clean),
    tc5b_num = as.numeric(tc5b_clean),
    tc2_15b_num = as.numeric(tc2_15b_clean)
  ) %>%
  mutate(
    # mean of the five ms items (ms1, ms3, ms4, ms5, ms6)
    ms_average = (ms1_num + ms3_num + ms4_num + ms5_num + ms6_num) / 5,
    # 1 when tax_morale equals its second factor level
    pay_even_disagree = ifelse(tax_morale == levels(tax_morale)[2], 1, 0)
  ) %>%
  mutate(
    # supervisor and enumerator must be character for merging with endline
    sup = as.character(sup),
    enum = as.character(enum),
    female = as.numeric(female) - 1,
    no_rcpt_when_pay_num = as.numeric(no_rcpt_when_pay)
  ) %>%
  mutate(
    # 1 when the answer contains "Yes", 0 otherwise, NA kept as NA
    sp1_yn = ifelse(is.na(sp1), NA, ifelse(grepl("Yes", sp1), 1, 0)),
    sp3_yn = ifelse(is.na(sp3), NA, ifelse(grepl("Yes", sp3), 1, 0)),
    sell_in_othr_mkts = ifelse(sp5 == "Yes", 1, 0)
  )

# Years in market: values above 100 are set to missing in a new column.
vendor_base <- vendor_base %>%
  mutate(yrs_in_mkt_fix = ifelse(yrs_in_mkt > 100, NA, yrs_in_mkt))

# Education as numbers:
#   educ_num  - level code of `education`, shifted so the first level is 0
#   educ_none - 1 when educ_num is 0
#   educ_cat  - 1 for (0, 9], 2 for (9, 13], 3 above 13, otherwise educ_num
vendor_base <- vendor_base %>%
  mutate(
    educ_num = as.numeric(education) - 1,
    educ_none = 1 * (educ_num == 0),
    educ_cat = ifelse(
      educ_num > 0 & educ_num <= 9, 1,
      ifelse(educ_num > 9 & educ_num <= 13, 2,
             ifelse(educ_num > 13, 3, educ_num))
    )
  )

# Receipt age straight from the survey data, plus the date components used to
# repair receipt dates.
vendor_base <- vendor_base %>%
  mutate(
    # days from the receipt date to the survey start date: the more positive,
    # the older the receipt; negative values are "impossible" values
    receipt_age_orig = as.numeric(
      difftime(strptime(starttime, format = "%Y-%m-%d", tz = "UTC"),
               receipt_date,
               units = "days")
    ),
    # keep the untouched receipt date
    receipt_date_orig = receipt_date
  ) %>%
  mutate(
    receipt_yr = year(receipt_date),
    receipt_month = month(receipt_date),
    receipt_day = day(receipt_date),
    # any year after 2017 is turned into 2017 (likely enumerator error)
    receipt_year = if_else(receipt_yr > 2017, 2017, receipt_yr)
  ) %>%
  # the uncapped year was only needed as an intermediate
  select(-receipt_yr)

# Fixing receipt date: rebuild it from receipt_year (capped at 2017),
# receipt_month and receipt_day. Rows without an original receipt date stay
# missing. The missing value is a typed Date NA so that both branches of
# if_else() have the same type regardless of the installed dplyr version.
vendor_base <- vendor_base %>%
  mutate(receipt_date = if_else(!is.na(receipt_date_orig),
                                make_date(year = receipt_year,
                                          month = receipt_month,
                                          day = receipt_day),
                                as.Date(NA)))

# Recent-receipt indicators. Everything below happens inside one mutate(), so
# later expressions see the columns created or overwritten by earlier ones;
# the order of the statements matters.
vendor_base <- vendor_base %>% 
  mutate(receipt_age = as.numeric(difftime(strptime(starttime, 
                                                    format = "%Y-%m-%d",
                                                    tz = "UTC"),
                                           receipt_date,
                                           units = "days")),
         #receipt age in days: survey start date minus receipt_date (as it
         #stands at this point in the script); positive = receipt predates
         #the survey
         #receipt age is otherwise ok at baseline
         #retain old recent_receipt vars (copied before they are overwritten)
         recent_receipt_7_v2a = recent_receipt_7,
         recent_receipt_10_v2a = recent_receipt_10,
         #make recent_receipt_* vars drop nonsensical values
         #step 1: 0 if no receipt date, 1 if age is within [0, 7] days, else 0
         recent_receipt_7 = ifelse(is.na(receipt_date), 0, 
                                   ifelse(receipt_age <= 7 &
                                            receipt_age >= 0, 1, 0)),
         #step 2: where a receipt date exists but the age is negative, set NA
         recent_receipt_7 = ifelse(is.na(receipt_date), recent_receipt_7, 
                                   ifelse(receipt_age < 0,
                                          NA, recent_receipt_7)),
         #same two steps with a 10-day window
         recent_receipt_10 = ifelse(is.na(receipt_date), 0, 
                                    ifelse(receipt_age <= 10 &
                                             receipt_age >= 0, 1, 0)),
         recent_receipt_10 = ifelse(is.na(receipt_date), recent_receipt_10, 
                                        ifelse(receipt_age < 0,
                                               NA, recent_receipt_10)),
         #"official" versions: recent receipt AND tc2a == "Yes"; NA is kept
         #wherever the recent receipt indicator is NA
         recent_receipt_7_ofcl = ifelse(is.na(recent_receipt_7), NA,
                                        ifelse(recent_receipt_7 == 1 &
                                          tc2a == "Yes", 1, 0)),
         recent_receipt_10_ofcl = ifelse(is.na(recent_receipt_10), NA,
                                         ifelse(recent_receipt_10 == 1 &
                                                  tc2a == "Yes", 1, 0)),
         #make version of recent receipt where we count "impossible" values as 0s
         #(same as step 1 above, without the NA override for negative ages)
         recent_receipt_7_v2b = ifelse(is.na(receipt_date), 0, 
                                       ifelse(receipt_age <= 7 &
                                                receipt_age >= 0, 1, 0)),
         recent_receipt_10_v2b = ifelse(is.na(receipt_date), 0, 
                                        ifelse(receipt_age <= 10 &
                                                 receipt_age >= 0, 1, 0)),
         #make receipt_check 1/0: 1 when it equals "Receipt Available", NA kept
         receipt_shown = ifelse(is.na(receipt_check), NA,
                                as.numeric(receipt_check == "Receipt Available"))
         )

# Behavioral outcomes (baseline).
# *_cl columns are the raw answers as No/Yes factors; the 1/0 indicators are
# derived from them. kw600 flags the answer "600 Kw in one week" on b2a.
vendor_base <- vendor_base %>%
  mutate(
    behavioral_cl = factor(behavioral, levels = c("No", "Yes")),
    petition = if_else(behavioral_cl == "Yes", 1, 0)
  ) %>%
  mutate(
    b1_cl = factor(b1, levels = c("No", "Yes")),
    petition_wname = if_else(b1_cl == "Yes", 1, 0)
  ) %>%
  mutate(kw600 = if_else(b2a == "600 Kw in one week", 1, 0))

# Saving -----------------------------------------------------------------------
# Write the cleaned data frame to vendor_base.RData in the clean data folder.
save(list = "vendor_base", file = paste0(clean_path, "vendor_base.RData"))
